Code
from google.colab import drive

# Mount Google Drive at /content/drive so the CSV files stored there can be read.
drive.mount('/content/drive')

# The cell output had been glued onto this import, corrupting the `pl` alias.
import polars as pl
# Output: Mounted at /content/drive
Reasons for Leaving School Early - Quarto Report
from google.colab import drive

# Mount Google Drive at /content/drive so the CSV files stored there can be read.
drive.mount('/content/drive')

# The cell output had been glued onto this import, corrupting the `pl` alias.
import polars as pl
# Output: Mounted at /content/drive
# Folder holding the CSV inputs. The path is relative, so it resolves against
# the current working directory.
# NOTE(review): presumably the working directory is /content (where Drive is
# mounted) - confirm before running elsewhere.
DATA_DIR = 'drive/MyDrive/Colab Notebooks'

# Country indicator table. Each file is now parsed once and bound to a name;
# previously it was read a second time with the result thrown away.
meta = pl.read_csv(
    f'{DATA_DIR}/unicef_metadata1.csv',
    infer_schema_length=10000,  # look at more rows before fixing column dtypes
    schema_overrides={'Population, total': pl.Float64},  # force this column to Float64
)

# Out-of-school-rate observations; later cells read its `country`, `sex`
# and `obs_value` columns.
OOSR = pl.read_csv(
    f'{DATA_DIR}/Out-of-school rate for adolescents of lower secondary school age (administrative data).csv'
)
# Hover over any country to see its avg. obs value!
import polars as pl
import geopandas as gpd
import plotly.express as px
# Start from the out-of-school-rate frame that was read from CSV earlier.
df = OOSR

# Restrict to the rows whose sex is "Total".
is_total = pl.col("sex") == "Total"
df_total = df.filter(is_total)

# One row per country: the mean obs_value, exposed as avg_dropout_rate.
mean_rate = pl.col("obs_value").mean().alias("avg_dropout_rate")
avg_dropout = df_total.group_by("country").agg(mean_rate)

# Hand Plotly a pandas frame.
avg_df = avg_dropout.to_pandas()

# World map shaded by the per-country average; countries are matched by name.
choropleth_options = dict(
    locations="country",
    locationmode="country names",
    color="avg_dropout_rate",
    color_continuous_scale="Reds",
    title="Average Dropout Rate per Country (Adolescents)",
)
fig = px.choropleth(avg_df, **choropleth_options)
fig.show()
#Is anyone left behind?
!pip install --upgrade polars
!pip install country_converter
!pip install plotnine geopandas
from plotnine import *
import country_converter as coco
import polars as pl
# Median out-of-school rate per country and sex (the "Total" rows are excluded),
# spread into one column per sex.
gender_data = (
    OOSR
    .filter(pl.col("sex") != "Total")
    .group_by(["country", "sex"])
    .agg(pl.col("obs_value").median())
    # `on` is the current name of the deprecated `columns` argument.
    .pivot(on="sex", index="country", values="obs_value")
)

# Attach a continent to every country; with not_found=None the converter is
# asked not to substitute a placeholder for names it cannot match.
cc = coco.CountryConverter()
countries = gender_data.get_column("country").to_list()
continent_list = cc.convert(names=countries, to='continent', not_found=None)
gender_data_with_continent = gender_data.with_columns(
    pl.Series(continent_list).alias("continent")
)

# Back to long format for plotting. `unpivot(index=...)` replaces the
# deprecated `melt(id_vars=...)`; every non-index column is unpivoted.
melted_data = gender_data_with_continent.unpivot(
    index=["country", "continent"],
    variable_name="sex",
    value_name="obs_value",
)

# Median of the per-country medians within each continent/sex pair.
continent_medians = (
    melted_data
    .group_by(["continent", "sex"])
    .agg(pl.col("obs_value").median().alias("median_obs_value"))
)

# Grouped bar chart. The complete theme must come BEFORE the theme() tweak:
# adding theme_minimal() afterwards would throw the label rotation away.
(
    ggplot(continent_medians, aes(x="continent", y="median_obs_value", fill="sex"))
    + geom_col(position="dodge")
    + labs(x="Continent", y="Median Out of School Rate", fill="Sex")
    + theme_minimal()
    + theme(axis_text_x=element_text(rotation=45))
    + scale_fill_manual(values=["purple", "pink"])
)
# Output: Requirement already satisfied: polars in /usr/local/lib/python3.11/dist-packages (1.27.1)
Requirement already satisfied: country_converter in /usr/local/lib/python3.11/dist-packages (1.3)
Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.11/dist-packages (from country_converter) (2.2.2)
Requirement already satisfied: numpy>=1.23.2 in /usr/local/lib/python3.11/dist-packages (from pandas>=1.0->country_converter) (2.0.2)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas>=1.0->country_converter) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas>=1.0->country_converter) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas>=1.0->country_converter) (2025.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas>=1.0->country_converter) (1.17.0)
Requirement already satisfied: plotnine in /usr/local/lib/python3.11/dist-packages (0.14.5)
Requirement already satisfied: geopandas in /usr/local/lib/python3.11/dist-packages (1.0.1)
Requirement already satisfied: matplotlib>=3.8.0 in /usr/local/lib/python3.11/dist-packages (from plotnine) (3.10.0)
Requirement already satisfied: pandas>=2.2.0 in /usr/local/lib/python3.11/dist-packages (from plotnine) (2.2.2)
Requirement already satisfied: mizani~=0.13.0 in /usr/local/lib/python3.11/dist-packages (from plotnine) (0.13.3)
Requirement already satisfied: numpy>=1.23.5 in /usr/local/lib/python3.11/dist-packages (from plotnine) (2.0.2)
Requirement already satisfied: scipy>=1.8.0 in /usr/local/lib/python3.11/dist-packages (from plotnine) (1.14.1)
Requirement already satisfied: statsmodels>=0.14.0 in /usr/local/lib/python3.11/dist-packages (from plotnine) (0.14.4)
Requirement already satisfied: pyogrio>=0.7.2 in /usr/local/lib/python3.11/dist-packages (from geopandas) (0.10.0)
Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from geopandas) (24.2)
Requirement already satisfied: pyproj>=3.3.0 in /usr/local/lib/python3.11/dist-packages (from geopandas) (3.7.1)
Requirement already satisfied: shapely>=2.0.0 in /usr/local/lib/python3.11/dist-packages (from geopandas) (2.1.0)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.8.0->plotnine) (1.3.2)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.8.0->plotnine) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.8.0->plotnine) (4.57.0)
Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.8.0->plotnine) (1.4.8)
Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.8.0->plotnine) (11.1.0)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.8.0->plotnine) (3.2.3)
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.11/dist-packages (from matplotlib>=3.8.0->plotnine) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas>=2.2.0->plotnine) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas>=2.2.0->plotnine) (2025.2)
Requirement already satisfied: certifi in /usr/local/lib/python3.11/dist-packages (from pyogrio>=0.7.2->geopandas) (2025.1.31)
Requirement already satisfied: patsy>=0.5.6 in /usr/local/lib/python3.11/dist-packages (from statsmodels>=0.14.0->plotnine) (1.0.1)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.7->matplotlib>=3.8.0->plotnine) (1.17.0)
DeprecationWarning: The argument `columns` for `DataFrame.pivot` is deprecated. It has been renamed to `on`.
<ipython-input-10-033859602bac>:23: DeprecationWarning: `DataFrame.melt` is deprecated. Use `unpivot` instead, with `index` instead of `id_vars` and `on` instead of `value_vars`
# Reduce each table to one row per country BEFORE joining. Joining the raw
# tables on `country` alone pairs every metadata row with every rate row of
# that country; the per-country means come out the same, but the intermediate
# frame is needlessly large.
gdp_by_country = meta.group_by("country").agg(
    pl.col("GDP per capita (constant 2015 US$)").mean().alias("GDP_per_capita")
)
# NOTE(review): this averages obs_value over every `sex` category, whereas
# other cells in this report filter to "Total" first - confirm that is intended.
rate_by_country = OOSR.group_by("country").agg(
    pl.col("obs_value").mean().alias("avg_obs_value")
)

# Default (inner) join: only countries present in both tables are kept.
gdp_dropout = gdp_by_country.join(rate_by_country, on="country")

# Scatter of mean GDP per capita against the mean out-of-school rate.
(
    ggplot(gdp_dropout, aes(x="avg_obs_value", y="GDP_per_capita"))
    + geom_point()
    + labs(x="Out of School Rate", y="GDP per capita (constant 2015 US$)")
    + theme_seaborn()
)
# Output: /usr/local/lib/python3.11/dist-packages/plotnine/layer.py:364: PlotnineWarning: geom_point : Removed 2 rows containing missing values.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load data: alias the Polars frames and convert them to pandas.
metadata = meta
dropout = OOSR
metadata_pd = metadata.to_pandas()
dropout_pd = dropout.to_pandas()

# Filter dropout data for 'Total' sex
dropout_total = dropout_pd[dropout_pd["sex"] == "Total"]

# Calculate max dropout rate per country
dropout_max = dropout_total.groupby("country")["obs_value"].max().reset_index()
dropout_max.columns = ["country", "max_dropout"]

# Calculate median life expectancy per country
life_expectancy = metadata_pd.groupby("country")["Life expectancy at birth, total (years)"].median().reset_index()
life_expectancy.columns = ["country", "median_life_expectancy"]

# Merge both datasets; the inner join keeps only countries present in both.
merged = pd.merge(dropout_max, life_expectancy, on="country", how="inner")

# (A pivot_table keyed on median_life_expectancy used to be built here; its
# result was never read, so it has been removed.)

# Sort countries so the heatmap rows run from highest to lowest life expectancy.
merged_sorted = merged.sort_values("median_life_expectancy", ascending=False)

# Plot heatmap (using just country as axis for simplicity): a single
# max_dropout column, one row per country.
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=merged_sorted.set_index("country")[["max_dropout"]],
    cmap="YlOrRd",
    annot=False,
    fmt=".1f",
    linewidths=0.5,
    cbar_kws={'label': 'Max Dropout Rate (%)'},
    xticklabels=False
)
plt.title("Heatmap of Max Dropout Rate vs Median Life Expectancy by Country")
plt.xlabel("Max Dropout Rate")
plt.ylabel("Country (sorted by life expectancy)")
plt.tight_layout(pad=0.5)
plt.show()
# This import had been fused onto the plt.show() call above.
import pandas as pd
import plotly.express as px
# Load data: convert the Polars frames to pandas straight from `meta` and
# `OOSR`. The earlier version first bound the Polars frames to these names
# (immediately overwritten) and then converted via `metadata` / `dropout`,
# aliases that only exist if a previous cell has already run.
metadata_pd = meta.to_pandas()
dropout_pd = OOSR.to_pandas()

# Filter dropout data for 'Total' sex
dropout_total = dropout_pd[dropout_pd["sex"] == "Total"]

# Median dropout rate per country
dropout_median = dropout_total.groupby("country")["obs_value"].median().reset_index()
dropout_median.columns = ["country", "median_dropout"]

# Aggregate metadata to one row per country, with a different statistic
# per indicator.
metadata_grouped = metadata_pd.groupby("country").agg({
    "Military expenditure (% of GDP)": "max",
    "Life expectancy at birth, total (years)": "median",
    "GDP per capita (constant 2015 US$)": "max"
}).reset_index()

# Merge; the inner join keeps only countries present in both tables.
merged = pd.merge(metadata_grouped, dropout_median, on="country", how="inner")

# Short column names for the plotting code that follows.
merged = merged.rename(columns={
    "Military expenditure (% of GDP)": "military_expenditure",
    "Life expectancy at birth, total (years)": "life_expectancy",
    "GDP per capita (constant 2015 US$)": "gdp_per_capita"
})
# Define income level
def income_level(gdp, high_threshold=45000, middle_threshold=20000):
    """Classify a GDP-per-capita figure into an income band.

    Args:
        gdp: GDP per capita as a number; may be None or NaN when missing.
        high_threshold: Smallest value labelled "High Income".
        middle_threshold: Smallest value labelled "Middle Income".

    Returns:
        "High Income", "Middle Income" or "Low Income"; "Unknown" when
        ``gdp`` is missing.
    """
    # NaN is the only value that differs from itself. Without this guard every
    # comparison below is False for NaN and a missing GDP would be reported
    # as "Low Income".
    if gdp is None or gdp != gdp:
        return "Unknown"
    if gdp >= high_threshold:
        return "High Income"
    if gdp >= middle_threshold:
        return "Middle Income"
    return "Low Income"
# Label every country with an income band derived from its GDP per capita.
merged["income_level"] = merged["gdp_per_capita"].apply(income_level)

# Clean up: drop rows missing any of the values the chart sizes, colours
# or shows on hover.
merged = merged.dropna(subset=["military_expenditure", "median_dropout", "life_expectancy"])

# Sunburst chart: inner ring = income band, outer ring = country; wedge size
# is military expenditure and wedge colour is the median dropout rate.
fig = px.sunburst(
    merged,
    path=["income_level", "country"],
    values="military_expenditure",
    color="median_dropout",
    color_continuous_scale="YlOrRd",
    hover_data=["life_expectancy"],
    labels={
        "military_expenditure": "Military Expenditure (% of GDP)",
        "median_dropout": "Dropout Rate (%)",
        "life_expectancy": "Life Expectancy (Years)",
        "income_level": "Income Level"
    },
    title="Military Expenditure Breakdown by Income Group<br>(Dropout Rate = Color, Life Expectancy on Hover)"
)
fig.update_traces(insidetextorientation='radial')
fig.show()
# This import had been fused onto the fig.show() call above.
import plotly.express as px
import pandas as pd
# Alias the Polars frames loaded earlier and obtain pandas copies of them.
metadata = meta
dropout = OOSR
metadata_pd = metadata.to_pandas()
dropout_pd = dropout.to_pandas()

# Keep the rows whose sex is 'Total'.
total_mask = dropout_pd["sex"] == "Total"
dropout_total = dropout_pd[total_mask]

# One row per country holding the median obs_value, named median_dropout.
dropout_median = (
    dropout_total.groupby("country", as_index=False)["obs_value"]
    .median()
    .rename(columns={"obs_value": "median_dropout"})
)

# Source indicator column -> short name used by the plotting code.
column_aliases = {
    "Birth rate, crude (per 1,000 people)": "birth_rate",
    "Life expectancy at birth, total (years)": "life_expectancy",
    "GDP per capita (constant 2015 US$)": "gdp_per_capita",
}

# Per-country maximum of each of those indicators.
metadata_grouped = (
    metadata_pd.groupby("country")
    .agg(dict.fromkeys(column_aliases, "max"))
    .reset_index()
)

# Inner-join the two per-country tables, then switch to the short names.
merged = metadata_grouped.merge(dropout_median, on="country", how="inner")
merged = merged.rename(columns=column_aliases)
# Create income level calculated field
def income_level(gdp, high_threshold=45000, middle_threshold=20000):
    """Classify a GDP-per-capita figure into an income band.

    Args:
        gdp: GDP per capita as a number; may be None or NaN when missing.
        high_threshold: Smallest value labelled "High Income".
        middle_threshold: Smallest value labelled "Middle Income".

    Returns:
        "High Income", "Middle Income" or "Low Income"; "Unknown" when
        ``gdp`` is missing.
    """
    # NaN is the only value that differs from itself. Without this guard every
    # comparison below is False for NaN and a missing GDP would be reported
    # as "Low Income".
    if gdp is None or gdp != gdp:
        return "Unknown"
    if gdp >= high_threshold:
        return "High Income"
    if gdp >= middle_threshold:
        return "Middle Income"
    return "Low Income"
# Label every country with an income band derived from its GDP per capita.
merged["income_level"] = merged["gdp_per_capita"].apply(income_level)

# Color mapping: one definition, passed to the chart below. Previously a
# second, slightly different palette ("gold" for Low Income) was defined here
# and written to a `color` column that nothing read; the values kept are the
# ones the chart actually used.
color_map = {
    "High Income": "green",
    "Middle Income": "orange",
    "Low Income": "blue",
}

# Drop rows lacking a plotted value; in particular, a missing marker size
# cannot be drawn.
merged = merged.dropna(subset=["birth_rate", "life_expectancy", "median_dropout"])

# Bubble chart: birth rate vs life expectancy, bubble size = median dropout
# rate, colour = income band, with an OLS trendline.
fig = px.scatter(
    merged,
    x="birth_rate",
    y="life_expectancy",
    size="median_dropout",
    color="income_level",
    color_discrete_map=color_map,  # Custom color mapping
    hover_name="country",  # Show country name on hover
    hover_data={  # Show additional data on hover
        "birth_rate": True,
        "life_expectancy": True,
        "median_dropout": True,
        "income_level": True
    },
    title="Max Birth Rate vs. Life Expectancy by Country (Interactive)<br>(Trendline: Linear Regression)",
    labels={
        "birth_rate": "Max Birth Rate (per 1,000 people)",
        "life_expectancy": "Max Life Expectancy (years)",
        "median_dropout": "Median Dropout Rate",
        "income_level": "Income Level"
    },
    trendline="ols"
)
fig.show()